# Install MatchIt and gridExtra only when they are not already available, so
# that re-running the script does not reinstall them on every execution.
for (pkg in c("MatchIt", "gridExtra")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(MatchIt)
library(dplyr)
library(ggplot2)

# Load the data set; the path is resolved relative to the working directory.
master <- read.csv(file = "PPMI_Data.csv")

# Outcome (TIMETOLEVO_YR, time to LD) by cohort: group size, number of
# non-missing outcomes, mean, and standard error of the mean. Missing outcomes
# are ignored here, matching what t.test() at the end of this section does, and
# the standard error is based on the non-missing count rather than on n().
master %>%
  group_by(COHORT) %>%
  summarise(n_patients = n(),
            n_observed = sum(!is.na(TIMETOLEVO_YR)),
            mean_time_to_ld = mean(TIMETOLEVO_YR, na.rm = TRUE),
            std_error = sd(TIMETOLEVO_YR, na.rm = TRUE) / sqrt(n_observed))

# Same comparison on the standardized outcome: the time to LD is centred on the
# overall mean and scaled by the overall standard deviation before averaging
# within each cohort.
master %>%
  mutate(test = (TIMETOLEVO_YR - mean(TIMETOLEVO_YR, na.rm = TRUE)) /
           sd(TIMETOLEVO_YR, na.rm = TRUE)) %>%
  group_by(COHORT) %>%
  summarise(mean_std_time_to_ld = mean(test, na.rm = TRUE))

# Unadjusted difference in mean outcome between cohorts (Welch t-test).
t.test(TIMETOLEVO_YR ~ COHORT, data = master)

# Covariates that are compared between cohorts before and after matching.
master_cov <- c("AGE", "GENDER", "RACE", "SMOKING", "CAFFEINE", "ALCOHOL",
                "HEAD")

# Pre-matching covariate means by cohort, ignoring missing values.
# across() replaces the deprecated summarise_all(funs(...)) idiom.
master %>%
  group_by(COHORT) %>%
  summarise(across(all_of(master_cov), ~ mean(.x, na.rm = TRUE)))

# Pre-matching Welch t-test of each covariate against cohort. The result list
# is named by covariate, and building the formula from the column name makes
# each printed test read "<covariate> by COHORT".
lapply(setNames(master_cov, master_cov), function(v) {
  t.test(reformulate("COHORT", response = v), data = master)
})

# Correlation between age and the outcome on complete pairs. The column is
# spelled out in full instead of relying on `$` partial matching.
# NOTE(review): assumes there is no separate column named exactly TIMETOLEVO
# in the data; confirm against the CSV.
cor(master$AGE, master$TIMETOLEVO_YR, use = "complete.obs")

# The per-covariate tests from the loop above, written out one at a time for
# readability:
t.test(AGE ~ COHORT, data = master)
t.test(GENDER ~ COHORT, data = master)
t.test(RACE ~ COHORT, data = master)
t.test(SMOKING ~ COHORT, data = master)
t.test(CAFFEINE ~ COHORT, data = master)
t.test(ALCOHOL ~ COHORT, data = master)
t.test(HEAD ~ COHORT, data = master)

# Logistic regression of cohort membership on the seven covariates; the fitted
# probabilities are used as propensity scores.
m_ps <- glm(
  COHORT ~ AGE + GENDER + RACE + SMOKING + CAFFEINE + ALCOHOL + HEAD,
  family = binomial(),
  data = master
)
summary(m_ps)

# Predicted probability for each observation in the model, paired with the
# cohort value taken from the model frame so both columns cover the same rows.
prs_df <- data.frame(
  pr_score = predict(m_ps, type = "response"),
  COHORT = m_ps$model$COHORT
)
head(prs_df)

# Histogram of the propensity scores, one facet per cohort. COHORT == 1 gets
# the "YES" label, every other value the "NO" label.
labs <- paste("Patient taking RAS Compound:", c("YES", "NO"))
ggplot(
  mutate(prs_df, COHORT = ifelse(COHORT == 1, labs[1], labs[2])),
  aes(x = pr_score)
) +
  geom_histogram(color = "white") +
  facet_wrap(~COHORT) +
  xlab("Probability of taking RAS Compound") +
  theme_bw()

# Complete-case analysis set: outcome, cohort and the covariates, with every
# row that contains a missing value removed (MatchIt does not allow missing
# values).
master_nomiss <- na.omit(
  select(master, TIMETOLEVO_YR, COHORT, one_of(master_cov))
)

# Nearest-neighbour matching of the cohorts on AGE, GENDER and RACE.
# NOTE(review): only three covariates enter the matching formula, while the
# balance checks elsewhere in this script look at every entry of master_cov;
# confirm that this is intended.
mod_match <- matchit(
  COHORT ~ AGE + GENDER + RACE,
  method = "nearest",
  data = master_nomiss
)

# Extract the matched sample as a data frame and show its dimensions.
dta_m <- match.data(mod_match)
dim(dta_m)




# Plot one covariate against the propensity score, coloured by cohort.
#
# Args:
#   dta:      Data frame holding the columns `distance` (plotted on the
#             x-axis), `COHORT`, and the column named by `variable`.
#   variable: Name of the covariate column to plot, as a single string. It is
#             also used as the y-axis label.
#
# Returns:
#   A ggplot object: the covariate plotted against `distance` as points with
#   one loess curve per cohort; the y-axis is limited to the covariate's
#   observed range.
fn_bal <- function(dta, variable) {
  # Fail early with a clear message instead of an obscure subsetting or
  # plotting error further down.
  if (!is.character(variable) || length(variable) != 1L || is.na(variable)) {
    stop("`variable` must be a single column name.", call. = FALSE)
  }
  missing_cols <- setdiff(c(variable, "distance", "COHORT"), names(dta))
  if (length(missing_cols) > 0) {
    stop("`dta` is missing required column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # `[[` always yields a plain vector, for data frames and tibbles alike.
  dta$variable <- dta[[variable]]
  dta$COHORT <- as.factor(dta$COHORT)
  support <- range(dta$variable)

  ggplot(dta, aes(x = distance, y = variable, color = COHORT)) +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = FALSE) +
    xlab("Propensity score") +
    ylab(variable) +
    theme_bw() +
    ylim(support)
}

library(gridExtra)

# Balance plots for the seven covariates arranged on a grid with four rows and
# two columns of relative width 1 : 0.8. Every second panel is drawn without
# its legend.
grid.arrange(
  grobs = unname(Map(
    function(covariate, hide_legend) {
      panel <- fn_bal(dta_m, covariate)
      if (hide_legend) {
        panel <- panel + theme(legend.position = "none")
      }
      panel
    },
    c("AGE", "GENDER", "RACE", "SMOKING", "CAFFEINE", "ALCOHOL", "HEAD"),
    rep_len(c(FALSE, TRUE), 7)
  )),
  nrow = 4, widths = c(1, 0.8)
)

# Post-matching covariate means by cohort. across() replaces the deprecated
# summarise_all(funs(...)) idiom.
dta_m %>%
  group_by(COHORT) %>%
  summarise(across(all_of(master_cov), mean))

# Save the matched sample as a tab-separated file in the working directory.
write.table(dta_m, "dta_m.txt", sep = "\t")

# Balance test on the matched sample: for each covariate we should not be able
# to reject the null hypothesis of no mean difference between cohorts. The
# result list is named by covariate, and building the formula from the column
# name makes each printed test read "<covariate> by COHORT".
lapply(setNames(master_cov, master_cov), function(v) {
  t.test(reformulate("COHORT", response = v), data = dta_m)
})




# Treatment effect in the matched sample: t-test of the outcome by cohort.
t.test(TIMETOLEVO_YR ~ COHORT, data = dta_m)

# OLS on the matched sample, adjusting for all covariates.
lm_treat2 <- lm(
  TIMETOLEVO_YR ~ COHORT + AGE + RACE + CAFFEINE + HEAD + GENDER + SMOKING +
    ALCOHOL,
  data = dta_m
)
summary(lm_treat2)

# OLS on the matched sample with cohort as the only predictor.
lm_treat1 <- lm(TIMETOLEVO_YR ~ COHORT, data = dta_m)
summary(lm_treat1)
